library(ggpubr)
Loading required package: magrittr
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
Attaching package: ‘ggpubr’
The following object is masked from ‘package:cowplot’:
get_legend
f74.rna.seu@meta.data <- f74.rna.seu@meta.data %>%
rownames_to_column("cell") %>%
left_join(annotation.df) %>%
column_to_rownames("cell")
Joining, by = "cell"
First I have made the snap file from the cellranger fragments.tsv file. Script is multiOmic_benchmark/preprocess/fragments2snap.sh.
Following integration vignette
Barcode selection
Filtering based on number of reads per cell and ratio of fragments that are within promoters

barcode.ls = lapply(seq(snap.files), function(i){
barcodes = barcode.ls[[i]];
idx = which(
barcodes$logUMI >= cutoff.logUMI.low[i] &
barcodes$logUMI <= cutoff.logUMI.high[i] &
barcodes$promoter_ratio >= cutoff.FRIP.low[i] &
barcodes$promoter_ratio <= cutoff.FRIP.high[i]
);
barcodes[idx,]
});
x.sp.ls = lapply(seq(snap.files), function(i){
barcodes = barcode.ls[[i]];
x.sp = x.sp.ls[[i]];
barcode.shared = intersect(x.sp@barcode, barcodes$barcode);
x.sp = x.sp[match(barcode.shared, x.sp@barcode),];
barcodes = barcodes[match(barcode.shared, barcodes$barcode),];
x.sp@metaData = barcodes;
x.sp
})
names(x.sp.ls) = sample.names;
x.sp = Reduce(snapRbind, x.sp.ls);
x.sp@metaData["sample"] = x.sp@sample;
x.sp
number of barcodes: 6043
number of bins: 0
number of genes: 0
number of peaks: 0
number of motifs: 0
Add cell-by-bin matrix
x.sp = addBmatToSnap(x.sp, bin.size = 5000)
Epoch: reading cell-bin count matrix session ...
Binarize matrix
Some items in the count matrix have abnormally high coverage perhaps due to the alignment errors. Therefore, we next remove top 0.1% items in the count matrix and then convert the remaining non-zero values to 1.
x.sp = makeBinary(x.sp, mat="bmat")
x.sp
number of barcodes: 6043
number of bins: 651791
number of genes: 0
number of peaks: 0
number of motifs: 0
Filter bins
Filter out bins overlapping w ENCODE blacklist
black_list = read.table("~/annotations/hg38.blacklist.bed.gz")
black_list.gr = GRanges(
black_list[,1],
IRanges(black_list[,2], black_list[,3])
);
idy = queryHits(
findOverlaps(x.sp@feature, black_list.gr)
);
if(length(idy) > 0){
x.sp = x.sp[,-idy, mat="bmat"];
};
x.sp
number of barcodes: 6043
number of bins: 651765
number of genes: 0
number of peaks: 0
number of motifs: 0
Exclude bad chromosomes
chr.exclude = seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))]
idy = grep(paste(chr.exclude, collapse="|"), x.sp@feature)
if(length(idy) > 0){
x.sp = x.sp[,-idy, mat="bmat"]
}
x.sp
number of barcodes: 6043
number of bins: 650342
number of genes: 0
number of peaks: 0
number of motifs: 0
remove the top 5% bins that overlap with invariant features such as the house keeping gene promoters
bin.cov = log10(Matrix::colSums(x.sp@bmat)+1)
# bin.cov = Matrix::colSums(x.sp@bmat)
hist(
bin.cov[bin.cov > 0],
xlab="log10(bin cov)",
main="log10(Bin Cov)",
col="lightblue",
# xlim=c(0, 5),
breaks=100
);

bin.cutoff = quantile(bin.cov[bin.cov > 0], 0.95)
idy = which(bin.cov <= bin.cutoff & bin.cov > 0)
x.sp = x.sp[, idy, mat="bmat"];
x.sp
number of barcodes: 6043
number of bins: 539584
number of genes: 0
number of peaks: 0
number of motifs: 0
Remove any cells of bin coverage less than 1,000. The rational behind this is that some cells may have high number of unique fragments but end up with low bin coverage after filtering. This step is optional but highly recommanded.
idx = which(Matrix::rowSums(x.sp@bmat) > 1000);
x.sp = x.sp[idx,];
x.sp
number of barcodes: 5793
number of bins: 539584
number of genes: 0
number of peaks: 0
number of motifs: 0
Dimensionality reduction
Uses diffusion map algorithm w sampling technique to make it fast.
## Sample 100 cells as landmarks
row.covs.dens <- density(
x = x.sp@metaData[,"logUMI"],
bw = 'nrd', adjust = 1
)
sampling_prob <- 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = x.sp@metaData[,"logUMI"])$y + .Machine$double.eps);
set.seed(1)
idx.landmark.ds <- sort(sample(x = seq(nrow(x.sp)), size = 1000, prob = sampling_prob))
## Split between landmark and query cells
x.landmark.sp = x.sp[idx.landmark.ds,];
x.query.sp = x.sp[-idx.landmark.ds,];
## Run diffusion map on landmark
x.landmark.sp = runDiffusionMaps(
obj= x.landmark.sp,
input.mat="bmat",
num.eigs=50
);
Epoch: checking the inputs ...
Epoch: computing jaccard similarity matrix ...
Epoch: fitting regression model ...
Epoch: performing normalization ...
Epoch: computing eigen decomposition ...
Epoch: Done
x.landmark.sp@metaData$landmark = 1;
## Project query cells
x.query.sp = runDiffusionMapsExtension(
obj1=x.landmark.sp,
obj2=x.query.sp,
input.mat="bmat"
)
Epoch: checking the inputs ...
Epoch: computing jaccard similarity matrix ...
Epoch: performing normalization ...
Epoch: projecting query cells to the reference ...
Epoch: Done
x.query.sp@metaData$landmark = 0;
## Combine
x.sp = snapRbind(x.landmark.sp, x.query.sp);
'rBind' is deprecated.
Since R version 3.2.0, base's rbind() should work fine with S4 objects
x.sp = x.sp[order(x.sp@metaData[,"sample"])]; #IMPORTANT
To determine significant diffusion components: > We use an ad hoc method by simply looking at a pairwise plot and select the number of eigen vectors that the scatter plot starts looking like a blob. In the below example, we choose the first 15 eigen vectors.
plotDimReductPW(
obj=x.sp,
eigs.dims=1:50,
point.size=0.3,
point.color="grey",
point.shape=19,
point.alpha=0.6,
down.sample=5000,
pdf.file.name=NULL,
pdf.height=7,
pdf.width=7
);

Clustering and visualization
x.sp=runCluster(
obj=x.sp,
tmp.folder=tempdir(),
louvain.lib="leiden",
seed.use=10,
resolution=0.7
);
Epoch: checking input parameters
Epoch: finding clusters using leiden
Visualization

Make gene matrix
saveRDS("~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snapATAC.RDS")
Error in saveRDS("~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snapATAC.RDS") :
'file' must be non-empty string
Repeat visualization on gmat

FeaturePlot(f74.seu, features = c('PTPRC','CD4','CD8A','CD8B','CD79A','FOXN1','EPCAM','PDGFRA','GNG4', 'FOXP3','RAG1','RAG2','NKG7','CCR7'), reduction = "umap.snap")
All cells have the same value (0) of CD79A.All cells have the same value (0) of RAG2.All cells have the same value (0) of NKG7.All cells have the same value (0) of CCR7.
DimHeatmap(f74.seu, dims = 1:6, balanced = TRUE,ncol = 1)


31 Oct 2019 09:28:40 [rsession-] CLIENT EXCEPTION (rsession-jovyan): (TypeError) : Cannot read property 'a' of null;
org/rstudio/studio/client/workbench/views/source/editors/text/ChunkPlotPage.java#52::execute
org/rstudio/studio/client/workbench/views/source/editors/text/ChunkPlotWidget.java#49::onBrowserEvent
com/google/gwt/user/client/DOM.java#1414::dispatchEvent
com/google/gwt/user/client/impl/DOMImplStandard.java#312::dispatchEvent
com/google/gwt/user/client/impl/DOMImplStandard.java#334::dispatchUnhandledEvent
com/google/gwt/core/client/impl/Impl.java#244::apply
com/google/gwt/core/client/impl/Impl.java#283::entry0
rstudio-0.js#-1::eval
Client-ID: 7beb3877-891d-4e16-98d5-f8cca1f72d5d
User-Agent: Mozilla/5.0 (Macintosh Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36

Differential analysis

Expression of markers in RNA data
map(1:10, ~ plot_cluster_marker_ATACvsRNA(.x))
All cells have the same value (0) of MIR4300HG.All cells have the same value (0) of NOS1.All cells have the same value (0) of RPH3A.Cannot convert object of class listggarrange into a grob.All cells have the same value (0) of C5orf66-AS2.All cells have the same value (0) of CYP3A43.
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
[[9]]
[[10]]










Embedding on most variable genes in ATAC (not in gene expression)

Thoughts
- There seems to be a correspondance for gene expression and accessibility in many “accessibility markers” (see Cluster 6) –> maybe integration would work better if I don’t take only the most variable features in the RNA data.
---
title: "Snap ATAC data processing"
output: html_notebook
---

```{r}
suppressPackageStartupMessages({
  library(SnapATAC)
  library(leiden)
  library(umap)
  library(GenomicRanges)
  library(zoo)
  library(tidyverse)
  library(cowplot)
  library(ggpubr)
})
source("~/multiOmic_benchmark/preprocess/selectFeatures.R")
```
```{r}
## Load and embed RNA data
f74.sce.list <- readRDS("~/my_data/integrated_thymus/F74_SCElist_20191017.RDS")
f74.rna <- f74.sce.list$RNA

f74.rna.seu <- as.Seurat(f74.rna, verbose=FALSE)
integrate_features <- HVG_Seurat(f74.rna, nfeatures = 3000)
VariableFeatures(f74.rna.seu) <- integrate_features
f74.rna.seu <- ScaleData(f74.rna.seu, do.center = TRUE, verbose=FALSE)
f74.rna.seu <- RunPCA(f74.rna.seu) 
f74.rna.seu <- RunUMAP(f74.rna.seu, reduction = "pca", dims=1:50) 

## Add cell type annotation
annotation.df <- read.csv("~/my_data/F74_RNA_obs.csv")
annotation.df <- annotation.df %>%
  mutate(cell=str_remove(as.character(X), "F74_1_") %>% str_c(ifelse(batch==0,'_1', "_2"))) 

f74.rna.seu@meta.data <- f74.rna.seu@meta.data %>%
  rownames_to_column("cell") %>%
  left_join(annotation.df) %>%
  column_to_rownames("cell")

```


First I have made the snap file from the cellranger `fragments.tsv` file. Script is `multiOmic_benchmark/preprocess/fragments2snap.sh`.

Following [integration vignette](https://github.com/r3fang/SnapATAC/blob/master/examples/10X_PBMC_15K/README.md) 

### Barcode selection
Filtering based on number of reads per cell and ratio of fragments that are within promoters
```{r}
snap.files <- "~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snap"
sample.names <- "F74"
barcode.files <- '~/my_data/singlecell.csv'

x.sp.ls <- list(createSnap(snap.files, sample.names))
names(x.sp.ls) = sample.names
x.sp.ls

barcode.ls = lapply(seq(snap.files), function(i){
    barcodes = read.csv(
        barcode.files[i], 
        head=TRUE
    );
    # remove NO BAROCDE line
    barcodes = barcodes[2:nrow(barcodes),];
    barcodes$logUMI = log10(barcodes$passed_filters + 1);
    barcodes$promoter_ratio = (barcodes$promoter_region_fragments+1) / (barcodes$passed_filters + 1);
    barcodes
  })

plots = lapply(seq(snap.files), function(i){
    p1 = ggplot(
        barcode.ls[[i]], 
        aes(x=logUMI, y=promoter_ratio)) + 
        geom_point(size=0.3, col="grey") +
        theme_classic()	+
        ggtitle(sample.names[[i]]) +
        ylim(0, 1) + xlim(0, 6) + 
        labs(x = "log10(counts)", y="promoter ratio")
        p1
    })

## Select and viz cutoffs
cutoff.logUMI.low = 3.3
cutoff.logUMI.high = 4.8
cutoff.FRIP.low = 0.25
cutoff.FRIP.high = 0.65


plots[[1]] + 
  geom_vline(xintercept = c(cutoff.logUMI.low[1],cutoff.logUMI.high[1]), linetype=2) +
  geom_hline(yintercept = c(cutoff.FRIP.low[1],cutoff.FRIP.high[1]), linetype=2)
```
```{r}
barcode.ls = lapply(seq(snap.files), function(i){
  barcodes = barcode.ls[[i]];
  idx = which(
      barcodes$logUMI >= cutoff.logUMI.low[i] & 
      barcodes$logUMI <= cutoff.logUMI.high[i] & 
      barcodes$promoter_ratio >= cutoff.FRIP.low[i] &
      barcodes$promoter_ratio <= cutoff.FRIP.high[i]
  );
  barcodes[idx,]
});
x.sp.ls = lapply(seq(snap.files), function(i){
  barcodes = barcode.ls[[i]];
  x.sp = x.sp.ls[[i]];
  barcode.shared = intersect(x.sp@barcode, barcodes$barcode);
  x.sp = x.sp[match(barcode.shared, x.sp@barcode),];
  barcodes = barcodes[match(barcode.shared, barcodes$barcode),];
  x.sp@metaData = barcodes;
  x.sp
})
names(x.sp.ls) = sample.names;
x.sp = Reduce(snapRbind, x.sp.ls);
x.sp@metaData["sample"] = x.sp@sample;
x.sp
```

### Add cell-by-bin matrix
```{r}
x.sp = addBmatToSnap(x.sp, bin.size = 5000)
```
## Binarize matrix
Some items in the count matrix have abnormally high coverage perhaps due to the alignment errors. Therefore, we next remove top 0.1% items in the count matrix and then convert the remaining non-zero values to 1.
```{r}
x.sp = makeBinary(x.sp, mat="bmat")
x.sp
```

## Filter bins
Filter out bins overlapping w ENCODE blacklist

```{r}
black_list = read.table("~/annotations/hg38.blacklist.bed.gz")
black_list.gr = GRanges(
  black_list[,1], 
  IRanges(black_list[,2], black_list[,3])
);
idy = queryHits(
  findOverlaps(x.sp@feature, black_list.gr)
);
if(length(idy) > 0){
  x.sp = x.sp[,-idy, mat="bmat"];
};
x.sp
```

Exclude bad chromosomes
```{r}
chr.exclude = seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))]
idy = grep(paste(chr.exclude, collapse="|"), x.sp@feature)
if(length(idy) > 0){
  x.sp = x.sp[,-idy, mat="bmat"]
}
x.sp
```

remove the top 5% bins that overlap with invariant features such as the house keeping gene promoters

```{r}
bin.cov = log10(Matrix::colSums(x.sp@bmat)+1)
# bin.cov = Matrix::colSums(x.sp@bmat)
hist(
  bin.cov[bin.cov > 0], 
  xlab="log10(bin cov)", 
  main="log10(Bin Cov)", 
  col="lightblue", 
  # xlim=c(0, 5),
  breaks=100
);
bin.cutoff = quantile(bin.cov[bin.cov > 0], 0.95)
idy = which(bin.cov <= bin.cutoff & bin.cov > 0)
x.sp = x.sp[, idy, mat="bmat"];
x.sp
```

Remove any cells of bin coverage less than 1,000. The rational behind this is that some cells may have high number of unique fragments but end up with low bin coverage after filtering. This step is optional but highly recommanded.
```{r}
idx = which(Matrix::rowSums(x.sp@bmat) > 1000);
x.sp = x.sp[idx,];
x.sp
```

## Dimensionality reduction
Uses diffusion map algorithm w sampling technique to make it fast.

```{r}
## Sample 100 cells as landmarks 
row.covs.dens <- density(
  x = x.sp@metaData[,"logUMI"], 
  bw = 'nrd', adjust = 1
)
sampling_prob <- 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = x.sp@metaData[,"logUMI"])$y + .Machine$double.eps);
set.seed(1)
idx.landmark.ds <- sort(sample(x = seq(nrow(x.sp)), size = 1000, prob = sampling_prob))

## Split between landmark and query cells
x.landmark.sp = x.sp[idx.landmark.ds,];
x.query.sp = x.sp[-idx.landmark.ds,];

## Run diffusion map on landmark
x.landmark.sp = runDiffusionMaps(
  obj= x.landmark.sp,
  input.mat="bmat", 
  num.eigs=50
);
x.landmark.sp@metaData$landmark = 1;

## Project query cells
x.query.sp = runDiffusionMapsExtension(
  obj1=x.landmark.sp, 
  obj2=x.query.sp,
  input.mat="bmat"
)
x.query.sp@metaData$landmark = 0;

## Combine 
x.sp = snapRbind(x.landmark.sp, x.query.sp);
x.sp = x.sp[order(x.sp@metaData[,"sample"])]; #IMPORTANT
```

To determine significant diffusion components:
> We use an ad hoc method by simply looking at a pairwise plot and select the number of eigen vectors that the scatter plot starts looking like a blob. In the below example, we choose the first 15 eigen vectors.

```{r}
plotDimReductPW(
    obj=x.sp, 
    eigs.dims=1:50,
    point.size=0.3,
    point.color="grey",
    point.shape=19,
    point.alpha=0.6,
    down.sample=5000,
    pdf.file.name=NULL, 
    pdf.height=7, 
    pdf.width=7
  );
```

## Clustering and visualization
```{r}
signif.dims = 1:16
x.sp = runKNN(
    obj=x.sp,
    eigs.dims=signif.dims,
    k=15
  );

x.sp=runCluster(
    obj=x.sp,
    tmp.folder=tempdir(),
    louvain.lib="leiden",
    seed.use=10,
    resolution=0.7
  );
```

Visualization
```{r, fig.height=5, fig.width=5}
x.sp = runViz(
 obj=x.sp, 
 tmp.folder=tempdir(),
 dims=2,
 eigs.dims=signif.dims, 
 method="umap",
 seed.use=10
)

plotViz(
    obj= x.sp,
    method="umap", 
    main="Cluster",
    point.color=x.sp@cluster, 
    point.size=0.2, 
    point.shape=19, 
    text.add=TRUE,
    text.size=1,
    text.color="black",
    down.sample=10000,
    legend.add=FALSE
  );

```


<!-- ## Outlier analysis -->
<!-- ```{r} -->
<!-- data.frame(x.sp@umap) %>% -->
<!--   mutate(cell=x.sp@barcode) %>% -->
<!--   # arrange(umap.1) -->
<!--   ggplot(aes(umap.1, umap.2)) + -->
<!--   geom_point(size=0.2) -->
<!-- ``` -->
<!-- ```{r, fig.height=5, fig.width=15} -->
<!-- outlier.ix.1 <- data.frame(x.sp@umap) %>% -->
<!--   mutate(cell=x.sp@barcode) %>% -->
<!--   rowid_to_column() %>% -->
<!--   dplyr::filter(umap.1 > 100) %>% -->
<!--   pull(rowid) -->
<!-- outlier.ix.2 <- data.frame(x.sp@umap) %>% -->
<!--   mutate(cell=x.sp@barcode) %>% -->
<!--   rowid_to_column() %>% -->
<!--   dplyr::filter(umap.2 < -50) %>% -->
<!--   pull(rowid) -->

<!-- smp.other <- sample(setdiff(1:length(x.sp@barcode), c(outlier.ix.1, outlier.ix.2)), 100) -->

<!-- long.outlier.bmat <- -->
<!--   x.sp@bmat[c(outlier.ix.1, outlier.ix.2, smp.other),] %>% -->
<!--   as.matrix() %>% -->
<!--   melt(value.name = "acc", varnames=c("barcode", "bin")) -->

<!-- # long.outlier.bmat %>% -->
<!-- #   # dplyr::filter(barcode=='AACAAAGGTAGACGCA-1') %>% -->
<!-- #   mutate(big.bins=cut(bin,breaks = 10000)) %>% -->
<!-- #   group_by(barcode) %>% -->
<!-- #   arrange(bin) %>% -->
<!-- #   mutate(runmean=rollmean(acc,k = 10000)[1:n()]) %>% -->
<!-- #   ungroup() %>% -->
<!-- #   group_by(barcode, big.bins) %>% -->
<!-- #   summarise(bin.mean=mean(runmean)) %>% -->
<!-- #   mutate(big.bins=as.numeric(big.bins)) %>% -->
<!-- #   mutate(outlier=ifelse(barcode %in% x.sp@barcode[outlier.ix], TRUE, FALSE)) %>% -->
<!-- #   ggplot(aes( big.bins, barcode, fill=bin.mean)) + -->
<!-- #   geom_tile() + -->
<!-- #   scale_fill_viridis_c() + -->
<!-- #   facet_wrap(outlier~., scales="free_y", ncol=1, nrow=2) -->
<!-- #   ungroup() %>% -->
<!-- #   ggplot(aes(bin, runmean)) + -->
<!-- #   facet_wrap(big.bins~., scale="free_x", ncol=2) + -->
<!-- #   geom_line(aes(group=barcode, color=outlier), size=0.5, alpha=0.5) -->
<!-- #    -->
<!-- ``` -->

<!-- ```{r} -->
<!-- long.outlier.bmat %>% -->
<!--   group_by(barcode) %>% -->
<!--   summarise(acc=sum(acc)) %>% -->
<!--   mutate(outlier=case_when(barcode %in% x.sp@barcode[outlier.ix.1] ~  "out1",  -->
<!--                            barcode %in% x.sp@barcode[outlier.ix.2] ~ "out2", -->
<!--                            FALSE ~ 'other')) %>% -->
<!--   # mutate(outlier=ifelse(barcode %in% x.sp@barcode[outlier.ix.2], "out2", FALSE)) %>% -->
<!--   ggplot(aes(outlier, acc)) + -->
<!--   ggbeeswarm::geom_quasirandom() -->
<!-- ``` -->


## Make gene matrix
```{r}
transcripts.gr = rtracklayer::import("~/Homo_sapiens.GRCh38.93.filtered.gtf")
colnames(transcripts.gr@elementMetadata) <- str_replace(colnames(transcripts.gr@elementMetadata), "gene_name", "name")

genes.gr <- unlist(range(split(transcripts.gr, ~ name)))  ## From transcripts to genes
genes.gr$name <- names(genes.gr)

if (GenomeInfoDb::seqlevelsStyle(genes.gr) != GenomeInfoDb::seqlevelsStyle(x.sp@feature) ) {
  GenomeInfoDb::seqlevelsStyle(genes.gr) <- GenomeInfoDb::seqlevelsStyle(x.sp@feature)
}

x.sp = createGmatFromMat(
    obj=x.sp, 
    input.mat="bmat",
    genes=genes.gr,
    do.par=TRUE,
    num.cores=10
  )

saveRDS(x.sp, file = "~/my_data/cellranger-atac110_count_30439_WSSS8038360_GRCh38-1_1_0.snapATAC.RDS")

```

Repeat visualization on gmat
```{r}
## Make suerat object
f74.seu <- snapToSeurat(
    obj=x.sp, 
    eigs.dims=1:20, 
    norm=TRUE,
    scale=TRUE
    )

## Save UMAP based on SnapATAC processing
f74.seu <- RunUMAP(f74.seu, reduction = "SnapATAC", dims=signif.dims, reduction.name = "umap.snap")


f74.seu <- FindVariableFeatures(f74.seu)
f74.seu <- RunPCA(f74.seu, dims=1:50)
f74.seu <- RunUMAP(f74.seu, reduction = "pca", dims=1:50, verbose=FALSE) 

ggpubr::ggarrange(
  DimPlot(f74.seu, reduction = "umap", group.by = NULL) + ggtitle("gmat"),
  DimPlot(f74.seu, reduction = "umap.snap", group.by = NULL) + ggtitle("bmat - snap") 
  )

```


```{r}
plotly::ggplotly(DimPlot(f74.seu, reduction = "umap", group.by = NULL) + ggtitle("gmat"))
```

```{r, fig.height=15, fig.width=15}
FeaturePlot(f74.seu, features = c('PTPRC','CD4','CD8A','CD8B','CD79A','FOXN1','EPCAM','PDGFRA','GNG4', 'FOXP3','RAG1','RAG2','NKG7','CCR7'), reduction = "umap.snap")
```

```{r, fig.height=15, fig.width=15}
DimHeatmap(f74.seu, dims = 1:6, balanced = TRUE,ncol = 1)
```

### Differential analysis 
<!-- Snap ATAC way -->
<!-- ```{r} -->
<!-- cluster = 1 -->
<!-- DARs= findDAR( -->
<!--     obj=x.sp, -->
<!--     input.mat="gmat", -->
<!--     cluster.pos=cluster, -->
<!--     # cluster.neg = 5, -->
<!--     cluster.neg.method="random", -->
<!--     test.method="exactTest", -->
<!--     bcv=0.4, #0.4 for human, 0.1 for mouse -->
<!--     seed.use=42 -->
<!--   ); -->


<!-- gmat.cl <- x.sp@gmat[which(x.sp@cluster==cluster),] -->
<!-- gmat.neg <- x.sp@gmat[which(x.sp@cluster!=cluster),] -->

<!-- hist(as.matrix(gmat.cl), breaks=50) -->


<!-- # DARs %>% -->
<!--   # mutate(gene=colnames(x.sp@gmat)) %>% -->
<!--   # arrange() %>% -->
<!--   # filter(PValue < 0.05) -->

<!-- hist(DARs$PValue, breaks = 20) -->

<!-- DARs$FDR = p.adjust(DARs$PValue, method="BH") -->
<!-- idy = which(DARs$FDR < 5e-2 & DARs$logFC > 0) -->
<!-- plot(DARs$logCPM, DARs$logFC,  -->
<!--     pch=19, cex=0.1, col="grey",  -->
<!--     ylab="logFC", xlab="logCPM", -->
<!--     main=paste("Cluster", cluster) -->
<!--   ) -->
<!-- points(DARs$logCPM[idy],  -->
<!--     DARs$logFC[idy],  -->
<!--     pch=19,  -->
<!--     cex=0.5,  -->
<!--     col="red" -->
<!--   ) -->

<!-- ``` -->
```{r}
DAGs_cl <- FindAllMarkers(f74.seu, min.pct = 0.2, logfc.threshold = 0.3, only.pos = TRUE)
DAGs_cl <- 
  DAGs_cl %>%
  mutate(positive=avg_logFC > 0)
```
```{r, fig.height=14, fig.width=9}
top10_markers <- DAGs_cl %>%
  filter(positive) %>%
  group_by(cluster) %>%
  arrange(p_val_adj) %>%
  top_n(10) %>%
  pull(gene) %>%
  unique()

top10_markers <- DAGs_cl %>%
  filter(positive) %>%
  group_by(cluster) %>%
  # arrange(p_val_adj) %>%
  top_n(10,wt = - p_val_adj) %>%
  pull(gene)

DotPlot(f74.seu, features = top10_markers) +
  coord_flip()
```

Expression of markers in RNA data

```{r, fig.width=12, fig.height=16}
plot_marker_ATACvsRNA <- function(marker){
    ggpubr::ggarrange(
      FeaturePlot(f74.seu, feature=marker, reduction="umap.snap") + ggtitle("ATAC"), FeaturePlot(f74.rna.seu, feature=marker) + ggtitle("RNA")
    ) %>% annotate_figure(top=marker)
  }

plot_cluster_marker_ATACvsRNA <- function(cl){
  top10_cluster <-
    DAGs_cl %>%
    filter(positive) %>%
    filter(cluster==cl) %>%
    filter(gene %in% rownames(f74.rna.seu)) %>%
    top_n(10,wt = - p_val_adj) %>%
    pull(gene)
  ggarrange(plotlist = map(top10_cluster, ~ plot_marker_ATACvsRNA(.x)), ncol=2, nrow=5) %>% 
    annotate_figure(fig.lab=paste("Cluster", cl), fig.lab.size = 20, fig.lab.face = "bold")
  }

map(1:10, ~ plot_cluster_marker_ATACvsRNA(.x))
```

```{r}
plotly::ggplotly(DimPlot(f74.rna.seu, group.by = "annotation"))
```


## Embedding on most variable genes in ATAC (not in gene expression)
```{r}
DefaultAssay(f74.seu) <- "ACTIVITY"
f74.seu.hvgatac <- FindVariableFeatures(f74.seu)
f74.seu.hvgatac <- RunPCA(f74.seu.hvgatac, dims=1:30)
f74.seu.hvgatac <- RunUMAP(f74.seu.hvgatac, reduction = "pca", dims=1:30, verbose=FALSE) 

f74.seu.hvgatac <- AddMetaData(f74.seu.hvgatac, x.sp@cluster, col.name='SnapATAC_cluster')
DimPlot(f74.seu.hvgatac, reduction = "umap", group.by = "SnapATAC_cluster") 
```


## Thoughts

- There seems to be a correspondance for gene expression and accessibility in many "accessibility markers" (see Cluster 6) --> maybe integration would work better if I don't take only the most variable features in the RNA data. 



